{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 推荐系统 on MovieLens的数据集"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 用户和Movis关联关系处理（训练数据）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#保存数据\n",
    "import cPickle\n",
    "\n",
    "import itertools\n",
    "\n",
    "#处理事件字符串\n",
    "import datetime\n",
    "\n",
    "import numpy as np\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "#用户数目和电影数目\n",
    "from utils import n_Users, n_Movies\n",
    "\n",
    "from collections import defaultdict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "finished reading train & test\n"
     ]
    }
   ],
   "source": [
    " \"\"\"\n",
    "u1.base/u1.test 有4列：\n",
    "user：用户ID\n",
    "movieid：movieID\n",
    "rating：用户打分\n",
    "timestamp：表示用户打分的时间，unix seconds since 1/1/1970 UTC\n",
    "\"\"\"\n",
    "\n",
    "#统计每个用户参加的事件   / 每个事件参加的用户\n",
    "moviesForUser = defaultdict(set)\n",
    "usersForMovie = defaultdict(set)\n",
    "    \n",
    "for filename in [\"u1.base\", \"u1.test\"]:\n",
    "    #user id | item id | rating | timestamp\n",
    "    for line in open(filename,'r'):  #对每条记录\n",
    "        (user,movieid,rating,ts)=line.split('\\t')\n",
    "        \n",
    "        user_int = int(user)-1   #索引从0开始\n",
    "        movieid_int = int(movieid)-1\n",
    "               \n",
    "        #倒排表\n",
    "        moviesForUser[user_int].add(movieid_int)    #该用户对这个电影进行了打分\n",
    "        usersForMovie[movieid_int].add(user_int)    #该事件被用户参加\n",
    "\n",
    "##统计每个用户参加的事件\n",
    "cPickle.dump(moviesForUser, open(\"PE_moviesForUser.pkl\", 'wb'))\n",
    "##统计参加事件的用户\n",
    "cPickle.dump(usersForMovie, open(\"PE_usersForMoive.pkl\", 'wb'))\n",
    "\n",
    "#用户-电影关系矩阵表（打分），可用于后续LFM/SVD++处理的输入\n",
    "#这是一个稀疏矩阵，记录用户对每个电影的打分\n",
    "userMovieScores = ss.dok_matrix((n_Users, n_Movies))\n",
    "    \n",
    "#从训练文件读取数据\n",
    "filename = \"u1.base\"\n",
    "#user id | item id | rating | timestamp\n",
    "for line in open(filename,'r'):  #对每条记录\n",
    "    (user,movieid,rating,ts)=line.split('\\t')\n",
    "    \n",
    "    #下标从0开始\n",
    "    user_int = int(user)-1   #索引从0开始\n",
    "    movieid_int = int(movieid)-1\n",
    "\n",
    "    userMovieScores[user_int, movieid_int] = float(rating)\n",
    " \n",
    "#保存用户-事件关系矩阵R，以备后用\n",
    "sio.mmwrite(\"PE_userMovieScores\", userMovieScores)\n",
    "\n",
    "\n",
    "# 为了防止不必要的计算，我们找出来所有关联的用户 或者 关联的event\n",
    "# 所谓的关联用户，指的是至少在同一个event上有行为的用户pair\n",
    "# 关联的event指的是至少同一个user有行为的event pair\n",
    "uniqueUserPairs = set()\n",
    "uniqueMoviePairs = set()\n",
    "for movie in range(n_Movies):\n",
    "    users = usersForMovie[movie]\n",
    "    if len(users) > 2:\n",
    "        uniqueUserPairs.update(itertools.combinations(users, 2))\n",
    "        \n",
    "for user in range(n_Users):\n",
    "    movies = moviesForUser[user]\n",
    "    if len(movies) > 2:\n",
    "        uniqueMoviePairs.update(itertools.combinations(movies, 2))\n",
    " \n",
    "#保存用户-事件关系对索引表\n",
    "cPickle.dump(uniqueUserPairs, open(\"FE_uniqueUserPairs.pkl\", 'wb'))\n",
    "cPickle.dump(uniqueMoviePairs, open(\"PE_uniqueMoviePairs.pkl\", 'wb'))\n",
    "\n",
    "print \"finished reading train & test\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
